Search in sources :

Example 6 with VCFFilterHeaderLine

use of htsjdk.variant.vcf.VCFFilterHeaderLine in project jvarkit by lindenb.

the class VcfUcsc method doVcfToVcf.

@Override
protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
    final String TAG;
    if (StringUtil.isBlank(this.infoTag)) {
        TAG = "UCSC_" + database.toUpperCase() + "_" + table.toUpperCase();
    } else {
        TAG = this.infoTag;
    }
    VCFHeader header = in.getHeader();
    final VCFHeader h2 = new VCFHeader(header);
    h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, database + "." + table));
    if (!StringUtil.isBlank(this.filterIn)) {
        h2.addMetaDataLine(new VCFFilterHeaderLine(this.filterIn, "Set by " + this.getClass().getName()));
    } else if (!StringUtil.isBlank(this.filterOut)) {
        h2.addMetaDataLine(new VCFFilterHeaderLine(this.filterOut, "Set by " + this.getClass().getName()));
    }
    h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
    h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
    out.writeHeader(h2);
    final Map<Integer, PreparedStatement> bin2pstmt = new HashMap<>();
    try {
        if (!this.has_bin_column) {
            bin2pstmt.put(0, createPreparedStatement(0));
        }
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header).logger(LOG);
        while (in.hasNext()) {
            final VariantContext ctx = progress.watch(in.next());
            int start0, end0;
            if (// mutation starts *after* the base
            ctx.isIndel()) {
                start0 = ctx.getStart();
                end0 = ctx.getEnd();
            } else {
                start0 = ctx.getStart() - 1;
                end0 = ctx.getEnd();
            }
            // extends left/right
            start0 = Math.max(0, start0 - this.extend_bases);
            end0 += this.extend_bases;
            final Set<String> atts = new HashSet<String>();
            if (this.has_bin_column) {
                final List<Integer> binList = reg2bins(start0, end0);
                PreparedStatement pstmt = bin2pstmt.get(binList.size());
                if (pstmt == null) {
                    LOG.debug("create prepared statemement for bin.size=" + binList.size() + "[" + start0 + ":" + end0 + "]");
                    pstmt = createPreparedStatement(binList.size());
                    bin2pstmt.put(binList.size(), pstmt);
                }
                initPstmt(pstmt, ctx.getContig(), start0, end0);
                for (int x = 0; x < binList.size(); ++x) {
                    pstmt.setInt(4 + x, binList.get(x));
                }
                select(atts, pstmt);
            } else {
                // already defined
                final PreparedStatement pstmt = bin2pstmt.get(0);
                initPstmt(pstmt, ctx.getContig(), start0, end0);
                select(atts, pstmt);
            }
            if (atts.isEmpty() && StringUtil.isBlank(this.filterIn) && StringUtil.isBlank(this.filterOut)) {
                out.add(ctx);
                continue;
            }
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            if (!StringUtil.isBlank(this.filterIn) && !atts.isEmpty()) {
                vcb.filter(this.filterIn);
            } else if (!StringUtil.isBlank(this.filterOut) && atts.isEmpty()) {
                vcb.filter(this.filterOut);
            }
            if (!atts.isEmpty()) {
                vcb.attribute(TAG, atts.toArray());
            }
            out.add(vcb.make());
        }
        progress.finish();
        return 0;
    } catch (final SQLException err) {
        LOG.error(err);
        throw new RuntimeException("SQLError", err);
    } finally {
        for (final PreparedStatement pstmt : bin2pstmt.values()) {
            CloserUtil.close(pstmt);
        }
        bin2pstmt.clear();
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) HashMap(java.util.HashMap) SQLException(java.sql.SQLException) VariantContext(htsjdk.variant.variantcontext.VariantContext) PreparedStatement(java.sql.PreparedStatement) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 7 with VCFFilterHeaderLine

use of htsjdk.variant.vcf.VCFFilterHeaderLine in project jvarkit by lindenb.

the class VCFComm method doWork.

@Override
public int doWork(final List<String> args) {
    CloseableIterator<LineAndFile> iter = null;
    SortingCollection<LineAndFile> variants = null;
    VariantContextWriter w = null;
    try {
        if (args.isEmpty()) {
            LOG.error("Illegal number of arguments");
            return -1;
        }
        Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        variants = SortingCollection.newInstance(LineAndFile.class, new LineAndFileCodec(), new LineAndFileComparator(), super.sortingCollectionArgs.getMaxRecordsInRam(), super.sortingCollectionArgs.getTmpPaths());
        variants.setDestructiveIteration(true);
        /**
         * new sample names in the output vcf: one  sample per file
         */
        final Map<Integer, String> fileid2sampleName = new TreeMap<>();
        /**
         * samples names as they appear in the original VCF headers
         */
        final Counter<String> countInputSamples = new Counter<String>();
        /**
         * dicts
         */
        final List<SAMSequenceDictionary> all_dictionaries = new ArrayList<>();
        for (final String vcffilename : IOUtils.unrollFiles(args)) {
            LOG.info("Reading from " + vcffilename);
            final Input input = super.put(variants, vcffilename);
            String sampleName = vcffilename;
            if (sampleName.endsWith(".vcf.gz")) {
                sampleName = sampleName.substring(0, sampleName.length() - 7);
            } else if (sampleName.endsWith(".vcf.gz")) {
                sampleName = sampleName.substring(0, sampleName.length() - 4);
            }
            int slash = sampleName.lastIndexOf(File.separatorChar);
            if (slash != -1)
                sampleName = sampleName.substring(slash + 1);
            int suffix = 1;
            // loop until we find a uniq name
            for (; ; ) {
                final String key = sampleName + (suffix == 1 ? "" : "_" + suffix);
                if (fileid2sampleName.values().contains(key)) {
                    suffix++;
                    continue;
                }
                fileid2sampleName.put(input.file_id, key);
                metaData.add(new VCFHeaderLine(key, vcffilename));
                break;
            }
            for (final String sname : input.codecAndHeader.header.getSampleNamesInOrder()) {
                countInputSamples.incr(sname);
            }
            all_dictionaries.add(input.codecAndHeader.header.getSequenceDictionary());
        }
        variants.doneAdding();
        /**
         * unique sample name, if any present in all VCF
         */
        Optional<String> unqueSampleName = Optional.empty();
        if (countInputSamples.getCountCategories() == 1 && countInputSamples.count(countInputSamples.keySet().iterator().next()) == fileid2sampleName.size()) {
            unqueSampleName = Optional.of(countInputSamples.keySet().iterator().next());
            LOG.info("Unique sample name is " + unqueSampleName.get());
        }
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_FILTER_KEY);
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_NUMBER_KEY);
        metaData.add(new VCFHeaderLine(getClass().getSimpleName(), "version:" + getVersion() + " command:" + getProgramCommandLine()));
        final VCFFilterHeaderLine variantNotCalledInAllVcf = new VCFFilterHeaderLine("NotCalledEveryWhere", "Variant was NOT called in all input VCF");
        metaData.add(variantNotCalledInAllVcf);
        final VCFFilterHeaderLine variantWasFiltered = new VCFFilterHeaderLine("VariantWasFiltered", "At least one variant was filtered");
        metaData.add(variantWasFiltered);
        final VCFFormatHeaderLine variantQUALFormat = new VCFFormatHeaderLine("VCQUAL", 1, VCFHeaderLineType.Float, "Variant Quality");
        metaData.add(variantQUALFormat);
        metaData.add(new VCFFormatHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Number of allle in the src vcf"));
        metaData.add(new VCFFormatHeaderLine(VCFConstants.ALLELE_COUNT_KEY, 1, VCFHeaderLineType.Integer, "Number of ALT alllele"));
        final VCFInfoHeaderLine foundInCountVcfInfo = new VCFInfoHeaderLine("NVCF", 1, VCFHeaderLineType.Integer, "Number of VCF this variant was found");
        metaData.add(foundInCountVcfInfo);
        final VCFInfoHeaderLine variantTypesInfo = new VCFInfoHeaderLine("VTYPES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Distinct Variants type");
        metaData.add(variantTypesInfo);
        final VCFFilterHeaderLine multipleTypeFilters = new VCFFilterHeaderLine("DiscordantTypes", "Discordant types at this position");
        metaData.add(multipleTypeFilters);
        final VCFFormatHeaderLine variantTypeFormat = new VCFFormatHeaderLine("VTYPE", 1, VCFHeaderLineType.String, "Variant Type");
        metaData.add(variantTypeFormat);
        final VCFFilterHeaderLine uniqueVariantDiscordantGTFilter;
        if (unqueSampleName.isPresent()) {
            metaData.add(new VCFHeaderLine("UniqSample", unqueSampleName.get()));
            uniqueVariantDiscordantGTFilter = new VCFFilterHeaderLine("DiscordantGenotypeForUniqSample", "Genotype Dicordant for for sample " + unqueSampleName.get());
            metaData.add(uniqueVariantDiscordantGTFilter);
        } else {
            uniqueVariantDiscordantGTFilter = null;
        }
        final VCFHeader header = new VCFHeader(metaData, new ArrayList<>(fileid2sampleName.values()));
        if (// all have a dict
        !normalize_chr && !all_dictionaries.contains(null)) {
            SAMSequenceDictionary thedict = null;
            for (int x = 0; x < all_dictionaries.size(); ++x) {
                SAMSequenceDictionary d = all_dictionaries.get(x);
                if (thedict == null) {
                    thedict = d;
                } else if (!SequenceUtil.areSequenceDictionariesEqual(d, thedict)) {
                    thedict = null;
                    break;
                }
            }
            if (thedict != null)
                header.setSequenceDictionary(thedict);
        }
        w = super.openVariantContextWriter(super.outputFile);
        w.writeHeader(header);
        final List<LineAndFile> row = new ArrayList<LineAndFile>(super.inputs.size());
        final Comparator<LineAndFile> posCompare = (A, B) -> A.getContigPosRef().compareTo(B.getContigPosRef());
        iter = variants.iterator();
        for (; ; ) {
            LineAndFile rec = null;
            if (iter.hasNext()) {
                rec = iter.next();
            }
            if (rec == null || (!row.isEmpty() && posCompare.compare(row.get(0), rec) != 0)) {
                if (!row.isEmpty()) {
                    final VariantContext first = row.get(0).getContext();
                    /* in which file id we find this variant */
                    Set<Integer> fileids_for_variant = row.stream().map(LAF -> LAF.fileIdx).collect(Collectors.toSet());
                    // see with HAS multiple chrom/pos/ref but different alt
                    if (row.size() != fileids_for_variant.size()) {
                        for (; ; ) {
                            boolean ok = true;
                            for (int x = 0; ok && x + 1 < row.size(); ++x) {
                                final VariantContext ctxx = row.get(x).getContext();
                                final List<Allele> altsx = ctxx.getAlternateAlleles();
                                for (int y = x + 1; ok && y < row.size(); ++y) {
                                    if (row.get(x).fileIdx != row.get(y).fileIdx)
                                        continue;
                                    final VariantContext ctxy = row.get(y).getContext();
                                    final List<Allele> altsy = ctxy.getAlternateAlleles();
                                    if (altsx.equals(altsy))
                                        continue;
                                    if (!ctxx.isVariant() && ctxy.isVariant()) {
                                        row.remove(x);
                                    } else if (ctxx.isVariant() && !ctxy.isVariant()) {
                                        row.remove(y);
                                    } else if (!ctxx.isSNP() && ctxy.isSNP()) {
                                        row.remove(x);
                                    } else if (ctxx.isSNP() && !ctxy.isSNP()) {
                                        row.remove(y);
                                    } else if (altsx.size() > altsy.size()) {
                                        row.remove(x);
                                    } else if (altsx.size() < altsy.size()) {
                                        row.remove(y);
                                    } else {
                                        row.remove(y);
                                    }
                                    ok = false;
                                    break;
                                }
                            }
                            if (ok)
                                break;
                        }
                        fileids_for_variant = row.stream().map(LAF -> LAF.fileIdx).collect(Collectors.toSet());
                    }
                    if (row.size() != fileids_for_variant.size()) {
                        LOG.error("There are some duplicated variants at the position " + new ContigPosRef(first) + " in the same vcf file");
                        for (final LineAndFile laf : row) {
                            LOG.error("File [" + laf.fileIdx + "]" + fileid2sampleName.get(laf.fileIdx));
                            LOG.error("\t" + laf.getContigPosRef());
                        }
                        row.clear();
                    } else {
                        final Set<Allele> alleles = row.stream().flatMap(R -> R.getContext().getAlleles().stream()).collect(Collectors.toSet());
                        final VariantContextBuilder vcb = new VariantContextBuilder(getClass().getName(), first.getContig(), first.getStart(), first.getEnd(), alleles);
                        final Set<String> filters = new HashSet<>();
                        final Set<VariantContext.Type> variantContextTypes = new HashSet<>();
                        final List<Genotype> genotypes = new ArrayList<Genotype>();
                        for (final LineAndFile laf : row) {
                            if (laf.getContext().isFiltered())
                                filters.add(variantWasFiltered.getID());
                            variantContextTypes.add(laf.getContext().getType());
                            final GenotypeBuilder gbuilder = new GenotypeBuilder();
                            gbuilder.name(fileid2sampleName.get(laf.fileIdx));
                            if (unqueSampleName.isPresent()) {
                                final Genotype g0 = laf.getContext().getGenotype(unqueSampleName.get());
                                if (g0 == null) {
                                    iter.close();
                                    w.close();
                                    throw new IllegalStateException("Cannot find genotype for " + unqueSampleName.get());
                                }
                                if (g0.hasDP())
                                    gbuilder.DP(g0.getDP());
                                if (g0.hasGQ())
                                    gbuilder.GQ(g0.getGQ());
                                gbuilder.alleles(g0.getAlleles());
                            } else {
                                gbuilder.alleles(Arrays.asList(first.getReference(), first.getReference()));
                                if (laf.getContext().hasAttribute(VCFConstants.DEPTH_KEY)) {
                                    gbuilder.DP(laf.getContext().getAttributeAsInt(VCFConstants.DEPTH_KEY, 0));
                                }
                            }
                            if (laf.getContext().isFiltered()) {
                                gbuilder.filter("VCFFILTERED");
                            }
                            if (laf.getContext().hasLog10PError()) {
                                gbuilder.attribute(variantQUALFormat.getID(), laf.getContext().getPhredScaledQual());
                            }
                            gbuilder.attribute(VCFConstants.ALLELE_NUMBER_KEY, laf.getContext().getGenotypes().stream().flatMap(G -> G.getAlleles().stream()).filter(A -> !A.isNoCall()).count());
                            gbuilder.attribute(VCFConstants.ALLELE_COUNT_KEY, laf.getContext().getGenotypes().stream().flatMap(G -> G.getAlleles().stream()).filter(A -> !(A.isReference() || A.isNoCall())).count());
                            gbuilder.attribute(variantTypeFormat.getID(), laf.getContext().getType().name());
                            genotypes.add(gbuilder.make());
                        }
                        final String id = String.join(";", row.stream().map(LAF -> LAF.getContext()).filter(V -> V.hasID()).map(V -> V.getID()).collect(Collectors.toSet()));
                        if (!id.isEmpty())
                            vcb.id(id);
                        vcb.genotypes(genotypes);
                        if (unqueSampleName.isPresent()) {
                            boolean all_same = true;
                            for (int x = 0; all_same && x + 1 < genotypes.size(); ++x) {
                                if (!genotypes.get(x).isCalled())
                                    continue;
                                for (int y = x + 1; all_same && y < genotypes.size(); ++y) {
                                    if (!genotypes.get(y).isCalled())
                                        continue;
                                    if (!genotypes.get(x).sameGenotype(genotypes.get(y), true)) {
                                        all_same = false;
                                        break;
                                    }
                                }
                            }
                            if (!all_same)
                                filters.add(uniqueVariantDiscordantGTFilter.getID());
                        }
                        // Add AN
                        vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, genotypes.stream().filter(G -> G.isCalled()).mapToInt(G -> G.getAlleles().size()).sum());
                        if (!variantContextTypes.isEmpty()) {
                            vcb.attribute(variantTypesInfo.getID(), new ArrayList<>(variantContextTypes.stream().map(T -> T.name()).collect(Collectors.toSet())));
                            if (variantContextTypes.size() > 1) {
                                filters.add(multipleTypeFilters.getID());
                            }
                        }
                        vcb.attribute(foundInCountVcfInfo.getID(), fileids_for_variant.size());
                        boolean print = true;
                        if (row.size() == super.inputs.size() && ignore_everywhere) {
                            print = false;
                        }
                        if (fileids_for_variant.size() != fileid2sampleName.size()) {
                            filters.add(variantNotCalledInAllVcf.getID());
                            if (only_everywhere) {
                                print = false;
                            }
                        }
                        vcb.filters(filters);
                        if (print) {
                            w.add(vcb.make());
                        }
                    }
                    row.clear();
                }
                if (rec == null)
                    break;
            }
            row.add(rec);
        }
        iter.close();
        iter = null;
        w.close();
        w = null;
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(w);
        try {
            if (variants != null)
                variants.cleanup();
        } catch (Exception err) {
        }
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigPosRef(com.github.lindenb.jvarkit.util.vcf.ContigPosRef) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFConstants(htsjdk.variant.vcf.VCFConstants) Counter(com.github.lindenb.jvarkit.util.Counter) SortingCollection(htsjdk.samtools.util.SortingCollection) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) TreeMap(java.util.TreeMap) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Optional(java.util.Optional) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Pattern(java.util.regex.Pattern) Comparator(java.util.Comparator) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Counter(com.github.lindenb.jvarkit.util.Counter) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) ContigPosRef(com.github.lindenb.jvarkit.util.vcf.ContigPosRef) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) TreeMap(java.util.TreeMap) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Example 8 with VCFFilterHeaderLine

use of htsjdk.variant.vcf.VCFFilterHeaderLine in project jvarkit by lindenb.

the class VcfCalledWithAnotherMethod method doVcfToVcf.

protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
    final List<ExternalVcf> externalVcfs = new ArrayList<>();
    try {
        final VCFHeader header = in.getHeader();
        this.dictionary = header.getSequenceDictionary();
        /**
         * open primitive input
         */
        if (this.dictionary == null) {
            LOG.error("no dictionary in input");
            return -1;
        }
        final Set<File> samtoolsFiles = new HashSet<>();
        this.externalVcfStrs.stream().filter(S -> S.endsWith(".list")).map(S -> Paths.get(S)).forEach(P -> {
            try {
                samtoolsFiles.addAll(Files.readAllLines(P).stream().filter(L -> !(L.startsWith("#") || L.trim().isEmpty())).map(S -> new File(S)).collect(Collectors.toSet()));
            } catch (final Exception err) {
                throw new RuntimeIOException(err);
            }
        });
        samtoolsFiles.addAll(this.externalVcfStrs.stream().filter(S -> !S.endsWith(".list")).map(S -> new File(S)).collect(Collectors.toSet()));
        externalVcfs.addAll(samtoolsFiles.stream().map(F -> new ExternalVcf(F)).collect(Collectors.toList()));
        /**
         * check for uniq keys
         */
        final Set<String> uniqKeys = new HashSet<>();
        for (final ExternalVcf extvcf : externalVcfs) {
            int n = 0;
            for (; ; ) {
                final String newkey = extvcf.key + (n == 0 ? "" : "_" + n);
                if (!uniqKeys.contains(newkey)) {
                    extvcf.key = newkey;
                    uniqKeys.add(newkey);
                    break;
                }
                ++n;
            }
        }
        final VCFHeader h2 = new VCFHeader(header);
        for (final ExternalVcf extvcf : externalVcfs) {
            h2.addMetaDataLine(new VCFHeaderLine("otherVcf:" + extvcf.key, extvcf.vcfFile.getPath()));
        }
        final VCFFilterHeaderLine variantNotFoundElsewhereFILTER = (filterNameVariantNotFoundElseWhere.isEmpty() ? null : new VCFFilterHeaderLine(filterNameVariantNotFoundElseWhere, "Variant Was not found in other VCFs: " + externalVcfs.stream().map(S -> S.vcfFile.getPath()).collect(Collectors.joining(", "))));
        if (variantNotFoundElsewhereFILTER != null) {
            h2.addMetaDataLine(variantNotFoundElsewhereFILTER);
        }
        final VCFInfoHeaderLine variantFoundElseWhereKeys = new VCFInfoHeaderLine(this.infoFoundKey, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Variant was found in the VCFs designed by those keys");
        h2.addMetaDataLine(variantFoundElseWhereKeys);
        final VCFInfoHeaderLine variantFoundElseWhereCount = new VCFInfoHeaderLine(this.infoFoundKey, 1, VCFHeaderLineType.Integer, "Number of times the Variant was found  in the VCFs");
        h2.addMetaDataLine(variantFoundElseWhereCount);
        final VCFFormatHeaderLine genotypeCountSame = new VCFFormatHeaderLine(this.formatCountSame, 1, VCFHeaderLineType.Integer, "Number of times the Genotype was found the same in other VCFs");
        h2.addMetaDataLine(genotypeCountSame);
        final VCFFormatHeaderLine genotypeCountDiscordant = new VCFFormatHeaderLine(this.formatCountDiscordant, 1, VCFHeaderLineType.Integer, "Number of times the Genotype was found dicordant in other VCFs");
        h2.addMetaDataLine(genotypeCountDiscordant);
        super.addMetaData(h2);
        final VariantContextWriter w = super.openVariantContextWriter(outputFile);
        w.writeHeader(h2);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(this.dictionary);
        while (in.hasNext()) {
            final VariantContext ctx = progress.watch(in.next());
            final List<GenotypeCount> genotypeCounts = ctx.getGenotypes().stream().map(G -> new GenotypeCount(G)).collect(Collectors.toList());
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            final Set<String> variantFoundInOtherVcfs = new HashSet<>();
            for (final ExternalVcf extvcf : externalVcfs) {
                final List<VariantContext> otherVariants = extvcf.get(ctx);
                if (otherVariants.stream().filter(CTX2 -> {
                    if (ctx.getAlternateAlleles().isEmpty())
                        return true;
                    for (final Allele a : ctx.getAlternateAlleles()) {
                        if (CTX2.hasAllele(a))
                            return true;
                    }
                    return false;
                }).findAny().isPresent()) {
                    variantFoundInOtherVcfs.add(extvcf.key);
                }
                for (final GenotypeCount gt : genotypeCounts) {
                    for (final VariantContext otherVariant : otherVariants) {
                        final Genotype otherGt = otherVariant.getGenotype(gt.gt.getSampleName());
                        if (otherGt == null)
                            continue;
                        if (gt.gt.sameGenotype(otherGt) || (this.noCallSameAsHomRef && ((gt.gt.isNoCall() && otherGt.isHomRef()) || (gt.gt.isHomRef() && otherGt.isNoCall())))) {
                            gt.countSame++;
                        } else {
                            gt.countDiscordant++;
                        }
                    }
                }
            }
            vcb.genotypes(genotypeCounts.stream().map(G -> {
                final GenotypeBuilder gb = new GenotypeBuilder(G.gt);
                gb.attribute(genotypeCountSame.getID(), G.countSame);
                gb.attribute(genotypeCountDiscordant.getID(), G.countDiscordant);
                return gb.make();
            }).collect(Collectors.toList()));
            vcb.attribute(variantFoundElseWhereCount.getID(), variantFoundInOtherVcfs.size());
            if (variantFoundInOtherVcfs.isEmpty()) {
                if (variantNotFoundElsewhereFILTER != null) {
                    vcb.filter(variantNotFoundElsewhereFILTER.getID());
                }
            } else {
                if (variantNotFoundElsewhereFILTER != null && !ctx.isFiltered()) {
                    vcb.passFilters();
                }
                vcb.attribute(variantFoundElseWhereKeys.getID(), new ArrayList<>(variantFoundInOtherVcfs));
            }
            w.add(vcb.make());
        }
        progress.finish();
        while (!externalVcfs.isEmpty()) externalVcfs.remove(0).close();
        return RETURN_OK;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        while (!externalVcfs.isEmpty()) externalVcfs.remove(0).close();
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Files(java.nio.file.Files) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) List(java.util.List) Paths(java.nio.file.Paths) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) PeekIterator(htsjdk.samtools.util.PeekIterator) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) Comparator(java.util.Comparator) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) File(java.io.File)

Example 9 with VCFFilterHeaderLine

use of htsjdk.variant.vcf.VCFFilterHeaderLine in project jvarkit by lindenb.

the class IndexCovToVcf method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.deletionTreshold >= 1.0f) {
        LOG.error("Bad deletion treshold >=1.0");
        return -1;
    }
    if (this.duplicationTreshold <= 1.0f) {
        LOG.error("Bad duplication treshold <=1.0");
        return -1;
    }
    if (this.deletionTreshold >= this.duplicationTreshold) {
        LOG.error("Bad tresholds del>=dup");
        return -1;
    }
    final Pattern tab = Pattern.compile("[\t]");
    BufferedReader r = null;
    VariantContextWriter vcw = null;
    try {
        final SAMSequenceDictionary dict;
        if (this.refFile == null) {
            dict = null;
        } else {
            dict = SAMSequenceDictionaryExtractor.extractDictionary(this.refFile);
            if (dict == null) {
                LOG.error("Cannot find dict in " + this.refFile);
                return -1;
            }
        }
        r = super.openBufferedReader(oneFileOrNull(args));
        String line = r.readLine();
        if (line == null) {
            LOG.error("Cannot read first line of input");
            return -1;
        }
        String[] tokens = tab.split(line);
        if (tokens.length < 4 || !tokens[0].equals("#chrom") || !tokens[1].equals("start") || !tokens[2].equals("end")) {
            LOG.error("bad first line " + line);
            return -1;
        }
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, "GT");
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, "END");
        final VCFFormatHeaderLine foldHeader = new VCFFormatHeaderLine("F", 1, VCFHeaderLineType.Float, "Relative number of copy: 0.5 deletion 1 normal 2.0 duplication");
        metaData.add(foldHeader);
        final VCFFormatHeaderLine formatIsDeletion = new VCFFormatHeaderLine("DEL", 1, VCFHeaderLineType.Integer, "set to 1 if relative number of copy <= " + this.deletionTreshold);
        metaData.add(formatIsDeletion);
        final VCFFormatHeaderLine formatIsDuplication = new VCFFormatHeaderLine("DUP", 1, VCFHeaderLineType.Integer, "set to 1 if relative number of copy >= " + this.duplicationTreshold);
        metaData.add(formatIsDuplication);
        final VCFFilterHeaderLine filterAllDel = new VCFFilterHeaderLine("ALL_DEL", "number of samples greater than 1 and all are deletions");
        metaData.add(filterAllDel);
        final VCFFilterHeaderLine filterAllDup = new VCFFilterHeaderLine("ALL_DUP", "number of samples  greater than  1 and all are duplication");
        metaData.add(filterAllDup);
        final VCFFilterHeaderLine filterNoSV = new VCFFilterHeaderLine("NO_SV", "There is no DUP or DEL in this variant");
        metaData.add(filterNoSV);
        final VCFInfoHeaderLine infoNumDup = new VCFInfoHeaderLine("NDUP", 1, VCFHeaderLineType.Integer, "Number of samples being duplicated");
        metaData.add(infoNumDup);
        final VCFInfoHeaderLine infoNumDel = new VCFInfoHeaderLine("NDEL", 1, VCFHeaderLineType.Integer, "Number of samples being deleted");
        metaData.add(infoNumDel);
        final List<String> samples = Arrays.asList(tokens).subList(3, tokens.length);
        final VCFHeader vcfHeader = new VCFHeader(metaData, samples);
        if (dict != null) {
            vcfHeader.setSequenceDictionary(dict);
        }
        vcw = super.openVariantContextWriter(outputFile);
        vcw.writeHeader(vcfHeader);
        // final List<Allele> NO_CALL_NO_CALL = Arrays.asList(Allele.NO_CALL,Allele.NO_CALL);
        final Allele DUP_ALLELE = Allele.create("<DUP>", false);
        final Allele DEL_ALLELE = Allele.create("<DEL>", false);
        final Allele REF_ALLELE = Allele.create("N", true);
        while ((line = r.readLine()) != null) {
            if (StringUtil.isBlank(line))
                continue;
            tokens = tab.split(line);
            if (tokens.length != 3 + samples.size()) {
                throw new JvarkitException.TokenErrors("expected " + (samples.size() + 3) + "columns.", tokens);
            }
            final Set<Allele> alleles = new HashSet<>();
            alleles.add(REF_ALLELE);
            final VariantContextBuilder vcb = new VariantContextBuilder();
            vcb.chr(tokens[0]);
            vcb.start(Integer.parseInt(tokens[1]));
            final int chromEnd = Integer.parseInt(tokens[2]);
            vcb.stop(chromEnd);
            vcb.attribute(VCFConstants.END_KEY, chromEnd);
            if (dict != null) {
                final SAMSequenceRecord ssr = dict.getSequence(tokens[0]);
                if (ssr == null) {
                    LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(tokens[0], dict));
                    return -1;
                }
                if (chromEnd > ssr.getSequenceLength()) {
                    LOG.warn("WARNING sequence length in " + line + " is greater than in dictionary ");
                }
            }
            int count_dup = 0;
            int count_del = 0;
            final List<Genotype> genotypes = new ArrayList<>(samples.size());
            for (int i = 3; i < tokens.length; i++) {
                final float f = Float.parseFloat(tokens[i]);
                if (f < 0 || Float.isNaN(f) || !Float.isFinite(f)) {
                    LOG.error("Bad fold " + f + " in " + line);
                }
                final GenotypeBuilder gb;
                if (f <= this.deletionTreshold) {
                    gb = new GenotypeBuilder(samples.get(i - 3), Collections.singletonList(DEL_ALLELE));
                    alleles.add(DEL_ALLELE);
                    gb.attribute(formatIsDeletion.getID(), 1);
                    count_del++;
                } else if (f >= this.duplicationTreshold) {
                    gb = new GenotypeBuilder(samples.get(i - 3), Collections.singletonList(DUP_ALLELE));
                    alleles.add(DUP_ALLELE);
                    gb.attribute(formatIsDuplication.getID(), 1);
                    count_dup++;
                } else {
                    gb = new GenotypeBuilder(samples.get(i - 3), Collections.singletonList(REF_ALLELE));
                    gb.attribute(formatIsDuplication.getID(), 0);
                }
                gb.attribute(foldHeader.getID(), f);
                genotypes.add(gb.make());
            }
            vcb.alleles(alleles);
            vcb.genotypes(genotypes);
            if (count_dup == samples.size() && samples.size() != 1) {
                vcb.filter(filterAllDup.getID());
            }
            if (count_del == samples.size() && samples.size() != 1) {
                vcb.filter(filterAllDel.getID());
            }
            if (count_dup == 0 && count_del == 0) {
                vcb.filter(filterNoSV.getID());
            }
            vcb.attribute(infoNumDel.getID(), count_del);
            vcb.attribute(infoNumDup.getID(), count_dup);
            vcw.add(vcb.make());
        }
        vcw.close();
        vcw = null;
        r.close();
        r = null;
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(r);
        CloserUtil.close(vcw);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Pattern(java.util.regex.Pattern) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) BufferedReader(java.io.BufferedReader)

Example 10 with VCFFilterHeaderLine

use of htsjdk.variant.vcf.VCFFilterHeaderLine in project gatk-protected by broadinstitute.

the class FilterMutectCalls method onTraversalStart.

@Override
public void onTraversalStart() {
    final VCFHeader inputHeader = getHeaderForVariants();
    final Set<VCFHeaderLine> headerLines = new HashSet<>(inputHeader.getMetaDataInSortedOrder());
    Mutect2FilteringEngine.M_2_FILTER_NAMES.stream().map(GATKVCFHeaderLines::getFilterLine).forEach(headerLines::add);
    headerLines.add(new VCFFilterHeaderLine(Mutect2FilteringEngine.ARTIFACT_IN_NORMAL_FILTER_NAME, "artifact_in_normal"));
    headerLines.add(new VCFFilterHeaderLine(Mutect2FilteringEngine.MEDIAN_BASE_QUALITY_DIFFERENCE_FILTER_NAME, "ref - alt median base quality"));
    headerLines.add(new VCFFilterHeaderLine(Mutect2FilteringEngine.MEDIAN_MAPPING_QUALITY_DIFFERENCE_FILTER_NAME, "ref - alt median mapping quality"));
    headerLines.add(new VCFFilterHeaderLine(Mutect2FilteringEngine.MEDIAN_CLIPPING_DIFFERENCE_FILTER_NAME, "ref - alt median clipping"));
    headerLines.add(new VCFFilterHeaderLine(Mutect2FilteringEngine.MEDIAN_FRAGMENT_LENGTH_DIFFERENCE_FILTER_NAME, "abs(ref - alt) median fragment length"));
    headerLines.add(new VCFFilterHeaderLine(Mutect2FilteringEngine.READ_POSITION_FILTER_NAME, "median distance of alt variants from end of reads"));
    headerLines.add(new VCFFilterHeaderLine(Mutect2FilteringEngine.CONTAMINATION_FILTER_NAME, "contamination"));
    headerLines.addAll(getDefaultToolVCFHeaderLines());
    final VCFHeader vcfHeader = new VCFHeader(headerLines, inputHeader.getGenotypeSamples());
    vcfWriter = createVCFWriter(new File(outputVcf));
    vcfWriter.writeHeader(vcfHeader);
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) File(java.io.File)

Aggregations

VCFFilterHeaderLine (htsjdk.variant.vcf.VCFFilterHeaderLine)25 VCFHeader (htsjdk.variant.vcf.VCFHeader)23 VariantContext (htsjdk.variant.variantcontext.VariantContext)17 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)16 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)15 ArrayList (java.util.ArrayList)15 VCFInfoHeaderLine (htsjdk.variant.vcf.VCFInfoHeaderLine)14 VCFHeaderLine (htsjdk.variant.vcf.VCFHeaderLine)13 File (java.io.File)13 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)12 HashSet (java.util.HashSet)11 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)10 Allele (htsjdk.variant.variantcontext.Allele)10 VCFFormatHeaderLine (htsjdk.variant.vcf.VCFFormatHeaderLine)9 List (java.util.List)9 Genotype (htsjdk.variant.variantcontext.Genotype)8 GenotypeBuilder (htsjdk.variant.variantcontext.GenotypeBuilder)8 Set (java.util.Set)8 Collectors (java.util.stream.Collectors)8 Parameter (com.beust.jcommander.Parameter)7