Search in sources :

Example 16 with VCFInfoHeaderLine

use of htsjdk.variant.vcf.VCFInfoHeaderLine in project jvarkit by lindenb.

the class EigenInfoAnnotator method getAnnotations.

@Override
public Map<String, Object> getAnnotations(final VariantContext ctx) {
    if (!ctx.isVariant())
        return Collections.emptyMap();
    final int contig = contig2eigen(ctx.getContig());
    if (contig < 1)
        return Collections.emptyMap();
    try {
        if (this.codingFeatureReader == null) {
            this.codingFeatureReader = new TabixFeatureReader<>(new File(this.eigenDirectory, getTabixPrefix() + "coding_annot_04092016.tab.bgz").getPath(), new CodingFeatureCodec());
        }
        if (this.prev_contig == -1 || prev_contig != contig) {
            CloserUtil.close(this.nonCodingFeatureReader);
            this.nonCodingFeatureReader = new TabixFeatureReader<>(getNonCodingFileForContig(contig).getPath(), new NonCodingFeatureCodec());
            this.prev_contig = contig;
        }
        final Map<Allele, NonCodingFeature> alt2nonCoding = new HashMap<>();
        CloseableTribbleIterator<NonCodingFeature> iter1 = this.nonCodingFeatureReader.query(String.valueOf(contig), ctx.getStart(), ctx.getEnd());
        while (iter1.hasNext()) {
            NonCodingFeature feat = iter1.next();
            if (feat == null || !feat.accept(ctx))
                continue;
            alt2nonCoding.put(feat.alt, feat);
        }
        iter1.close();
        final Map<Allele, CodingFeature> alt2coding = new HashMap<>();
        CloseableTribbleIterator<CodingFeature> iter2 = this.codingFeatureReader.query(String.valueOf(contig), ctx.getStart(), ctx.getEnd());
        while (iter2.hasNext()) {
            CodingFeature feat = iter2.next();
            if (feat == null || !feat.accept(ctx))
                continue;
            alt2coding.put(feat.alt, feat);
        }
        iter2.close();
        if (alt2nonCoding.isEmpty() && alt2coding.isEmpty())
            return Collections.emptyMap();
        final List<Allele> alternateAlleles = ctx.getAlternateAlleles();
        final Map<String, Object> map = new HashMap<>();
        for (int side = 0; side < 2; ++side) {
            for (int i = 0; i < (side == 0 ? noncodingheaderlines.size() : codingheaderlines.size()); ++i) {
                final VCFInfoHeaderLine vihl = (side == 0 ? noncodingheaderlines.get(i) : codingheaderlines.get(i));
                final List<Object> atts = new ArrayList<>(alternateAlleles.size());
                boolean found_one = false;
                for (int altn = 0; altn < alternateAlleles.size(); ++altn) {
                    final Allele alt = alternateAlleles.get(altn);
                    final AbstractFeature feat = (side == 0 ? alt2nonCoding.get(alt) : alt2coding.get(alt));
                    if (feat == null) {
                        atts.add(".");
                        continue;
                    }
                    final String token = feat.get(4 + i);
                    if (token == null || token.isEmpty() || token.equals(".")) {
                        atts.add(".");
                        continue;
                    } else if (vihl.getType() == VCFHeaderLineType.String) {
                        found_one = true;
                        atts.add(token.replace(',', '|'));
                    } else {
                        try {
                            Float dbl = new Float(token);
                            atts.add(dbl);
                            found_one = true;
                        } catch (NumberFormatException err) {
                            throw new IOException("Cannot cast " + token + " to float for " + vihl);
                        }
                    }
                }
                if (!found_one) {
                // nothing
                } else if (vihl.getCountType() == VCFHeaderLineCount.R) {
                    map.put(vihl.getID(), new ArrayList<>(new LinkedHashSet<>(atts)));
                } else {
                    if (atts.size() != ctx.getAlternateAlleles().size()) {
                        System.err.println("Number of eigen data!=number of ALTS");
                    }
                    map.put(vihl.getID(), atts);
                }
            }
        }
        return map;
    } catch (final IOException err) {
        throw new RuntimeException(err);
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) IOException(java.io.IOException) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) File(java.io.File)

Example 17 with VCFInfoHeaderLine

use of htsjdk.variant.vcf.VCFInfoHeaderLine in project jvarkit by lindenb.

the class VCFFixIndels method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator r, VariantContextWriter w) {
    long nChanged = 0L;
    final String TAG = "INDELFIXED";
    final VCFHeader header = r.getHeader();
    final VCFHeader h2 = new VCFHeader(header);
    addMetaData(h2);
    h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, 1, VCFHeaderLineType.String, "Fix Indels for @SolenaLS (position|alleles...)"));
    w.writeHeader(h2);
    final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
    while (r.hasNext()) {
        boolean somethingChanged = false;
        VariantContext ctx = progress.watch(r.next());
        /* map old allele to new allele */
        Map<Allele, Allele> original2modified = new HashMap<Allele, Allele>();
        /* did we found a strange allele (symbolic, etc ) */
        boolean strange_allele_flag = false;
        for (final Allele a : ctx.getAlleles()) {
            original2modified.put(a, a);
            if (a.isSymbolic() || a.isNoCall()) {
                strange_allele_flag = true;
                break;
            }
        }
        if (strange_allele_flag || original2modified.size() < 2) /* at least 2 alleles: REF+ ALT */
        {
            w.add(ctx);
            continue;
        }
        /* record chromStart if we need to shift to the right */
        int chromStart = ctx.getStart();
        /* trim right then left  */
        for (int side = 0; side < 2; ++side) {
            boolean done = false;
            while (!done) {
                boolean ok_side = true;
                /* common nucleotide at end/start */
                Character targetChar = null;
                done = true;
                // scan side
                Set<Allele> keys = new HashSet<>(original2modified.keySet());
                for (Allele k : keys) {
                    Allele newAllele = original2modified.get(k);
                    if (newAllele.isSymbolic()) {
                        ok_side = false;
                        break;
                    }
                    String baseString = newAllele.getBaseString().trim().toUpperCase();
                    if (baseString.length() < 2) {
                        ok_side = false;
                        break;
                    }
                    /* first or last char or all sequences
						 * side==0 : right
						 * side==1 : left
						 * */
                    Character baseChar = (side == 0 ? baseString.charAt(baseString.length() - 1) : baseString.charAt(0));
                    if (targetChar == null) {
                        targetChar = baseChar;
                    } else if (!targetChar.equals(baseChar)) {
                        /* doesn't end with same nucleotide */
                        ok_side = false;
                        break;
                    }
                }
                /* ok we can shift all alleles */
                if (ok_side && targetChar != null) {
                    done = false;
                    somethingChanged = true;
                    for (Allele k : keys) {
                        Allele newAllele = original2modified.get(k);
                        String baseString = newAllele.getBaseString().trim().toUpperCase();
                        if (// remove last nucleotide
                        side == 0) {
                            newAllele = Allele.create(baseString.substring(0, baseString.length() - 1), newAllele.isReference());
                        } else {
                            newAllele = Allele.create(baseString.substring(1), newAllele.isReference());
                        }
                        original2modified.put(k, newAllele);
                    }
                    if (side == 1)
                        chromStart++;
                }
            }
        /* end of while done */
        }
        if (!somethingChanged) {
            w.add(ctx);
            continue;
        }
        final VariantContextBuilder b = new VariantContextBuilder(ctx);
        b.start(chromStart);
        Allele newRef = original2modified.get(ctx.getReference());
        b.stop(chromStart + newRef.getBaseString().length() - 1);
        b.alleles(original2modified.values());
        List<Genotype> genotypes = new ArrayList<>();
        for (final String sample : header.getSampleNamesInOrder()) {
            final Genotype g = ctx.getGenotype(sample);
            if (!g.isCalled()) {
                genotypes.add(g);
                continue;
            }
            final GenotypeBuilder gb = new GenotypeBuilder(g);
            final List<Allele> aL = new ArrayList<>();
            for (final Allele a : g.getAlleles()) {
                aL.add(original2modified.get(a));
            }
            gb.alleles(aL);
            genotypes.add(gb.make());
        }
        StringBuilder tagContent = new StringBuilder();
        tagContent.append(String.valueOf(ctx.getStart()));
        for (Allele a : ctx.getAlleles()) {
            tagContent.append("|");
            tagContent.append(a.toString());
        }
        b.attribute(TAG, tagContent.toString());
        b.genotypes(genotypes);
        w.add(b.make());
        ++nChanged;
        if (w.checkError())
            break;
    }
    progress.finish();
    LOG.info("indels changed:" + nChanged);
    return RETURN_OK;
}
Also used : SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet)

Example 18 with VCFInfoHeaderLine

use of htsjdk.variant.vcf.VCFInfoHeaderLine in project jvarkit by lindenb.

the class VCFComm method doWork.

@Override
public int doWork(final List<String> args) {
    CloseableIterator<LineAndFile> iter = null;
    SortingCollection<LineAndFile> variants = null;
    VariantContextWriter w = null;
    try {
        if (args.isEmpty()) {
            LOG.error("Illegal number of arguments");
            return -1;
        }
        Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        variants = SortingCollection.newInstance(LineAndFile.class, new LineAndFileCodec(), new LineAndFileComparator(), super.sortingCollectionArgs.getMaxRecordsInRam(), super.sortingCollectionArgs.getTmpPaths());
        variants.setDestructiveIteration(true);
        /**
         * new sample names in the output vcf: one  sample per file
         */
        final Map<Integer, String> fileid2sampleName = new TreeMap<>();
        /**
         * samples names as they appear in the original VCF headers
         */
        final Counter<String> countInputSamples = new Counter<String>();
        /**
         * dicts
         */
        final List<SAMSequenceDictionary> all_dictionaries = new ArrayList<>();
        for (final String vcffilename : IOUtils.unrollFiles(args)) {
            LOG.info("Reading from " + vcffilename);
            final Input input = super.put(variants, vcffilename);
            String sampleName = vcffilename;
            if (sampleName.endsWith(".vcf.gz")) {
                sampleName = sampleName.substring(0, sampleName.length() - 7);
            } else if (sampleName.endsWith(".vcf.gz")) {
                sampleName = sampleName.substring(0, sampleName.length() - 4);
            }
            int slash = sampleName.lastIndexOf(File.separatorChar);
            if (slash != -1)
                sampleName = sampleName.substring(slash + 1);
            int suffix = 1;
            // loop until we find a uniq name
            for (; ; ) {
                final String key = sampleName + (suffix == 1 ? "" : "_" + suffix);
                if (fileid2sampleName.values().contains(key)) {
                    suffix++;
                    continue;
                }
                fileid2sampleName.put(input.file_id, key);
                metaData.add(new VCFHeaderLine(key, vcffilename));
                break;
            }
            for (final String sname : input.codecAndHeader.header.getSampleNamesInOrder()) {
                countInputSamples.incr(sname);
            }
            all_dictionaries.add(input.codecAndHeader.header.getSequenceDictionary());
        }
        variants.doneAdding();
        /**
         * unique sample name, if any present in all VCF
         */
        Optional<String> unqueSampleName = Optional.empty();
        if (countInputSamples.getCountCategories() == 1 && countInputSamples.count(countInputSamples.keySet().iterator().next()) == fileid2sampleName.size()) {
            unqueSampleName = Optional.of(countInputSamples.keySet().iterator().next());
            LOG.info("Unique sample name is " + unqueSampleName.get());
        }
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.GENOTYPE_KEY, VCFConstants.GENOTYPE_FILTER_KEY);
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_NUMBER_KEY);
        metaData.add(new VCFHeaderLine(getClass().getSimpleName(), "version:" + getVersion() + " command:" + getProgramCommandLine()));
        final VCFFilterHeaderLine variantNotCalledInAllVcf = new VCFFilterHeaderLine("NotCalledEveryWhere", "Variant was NOT called in all input VCF");
        metaData.add(variantNotCalledInAllVcf);
        final VCFFilterHeaderLine variantWasFiltered = new VCFFilterHeaderLine("VariantWasFiltered", "At least one variant was filtered");
        metaData.add(variantWasFiltered);
        final VCFFormatHeaderLine variantQUALFormat = new VCFFormatHeaderLine("VCQUAL", 1, VCFHeaderLineType.Float, "Variant Quality");
        metaData.add(variantQUALFormat);
        metaData.add(new VCFFormatHeaderLine(VCFConstants.ALLELE_NUMBER_KEY, 1, VCFHeaderLineType.Integer, "Number of allle in the src vcf"));
        metaData.add(new VCFFormatHeaderLine(VCFConstants.ALLELE_COUNT_KEY, 1, VCFHeaderLineType.Integer, "Number of ALT alllele"));
        final VCFInfoHeaderLine foundInCountVcfInfo = new VCFInfoHeaderLine("NVCF", 1, VCFHeaderLineType.Integer, "Number of VCF this variant was found");
        metaData.add(foundInCountVcfInfo);
        final VCFInfoHeaderLine variantTypesInfo = new VCFInfoHeaderLine("VTYPES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Distinct Variants type");
        metaData.add(variantTypesInfo);
        final VCFFilterHeaderLine multipleTypeFilters = new VCFFilterHeaderLine("DiscordantTypes", "Discordant types at this position");
        metaData.add(multipleTypeFilters);
        final VCFFormatHeaderLine variantTypeFormat = new VCFFormatHeaderLine("VTYPE", 1, VCFHeaderLineType.String, "Variant Type");
        metaData.add(variantTypeFormat);
        final VCFFilterHeaderLine uniqueVariantDiscordantGTFilter;
        if (unqueSampleName.isPresent()) {
            metaData.add(new VCFHeaderLine("UniqSample", unqueSampleName.get()));
            uniqueVariantDiscordantGTFilter = new VCFFilterHeaderLine("DiscordantGenotypeForUniqSample", "Genotype Dicordant for for sample " + unqueSampleName.get());
            metaData.add(uniqueVariantDiscordantGTFilter);
        } else {
            uniqueVariantDiscordantGTFilter = null;
        }
        final VCFHeader header = new VCFHeader(metaData, new ArrayList<>(fileid2sampleName.values()));
        if (// all have a dict
        !normalize_chr && !all_dictionaries.contains(null)) {
            SAMSequenceDictionary thedict = null;
            for (int x = 0; x < all_dictionaries.size(); ++x) {
                SAMSequenceDictionary d = all_dictionaries.get(x);
                if (thedict == null) {
                    thedict = d;
                } else if (!SequenceUtil.areSequenceDictionariesEqual(d, thedict)) {
                    thedict = null;
                    break;
                }
            }
            if (thedict != null)
                header.setSequenceDictionary(thedict);
        }
        w = super.openVariantContextWriter(super.outputFile);
        w.writeHeader(header);
        final List<LineAndFile> row = new ArrayList<LineAndFile>(super.inputs.size());
        final Comparator<LineAndFile> posCompare = (A, B) -> A.getContigPosRef().compareTo(B.getContigPosRef());
        iter = variants.iterator();
        for (; ; ) {
            LineAndFile rec = null;
            if (iter.hasNext()) {
                rec = iter.next();
            }
            if (rec == null || (!row.isEmpty() && posCompare.compare(row.get(0), rec) != 0)) {
                if (!row.isEmpty()) {
                    final VariantContext first = row.get(0).getContext();
                    /* in which file id we find this variant */
                    Set<Integer> fileids_for_variant = row.stream().map(LAF -> LAF.fileIdx).collect(Collectors.toSet());
                    // see with HAS multiple chrom/pos/ref but different alt
                    if (row.size() != fileids_for_variant.size()) {
                        for (; ; ) {
                            boolean ok = true;
                            for (int x = 0; ok && x + 1 < row.size(); ++x) {
                                final VariantContext ctxx = row.get(x).getContext();
                                final List<Allele> altsx = ctxx.getAlternateAlleles();
                                for (int y = x + 1; ok && y < row.size(); ++y) {
                                    if (row.get(x).fileIdx != row.get(y).fileIdx)
                                        continue;
                                    final VariantContext ctxy = row.get(y).getContext();
                                    final List<Allele> altsy = ctxy.getAlternateAlleles();
                                    if (altsx.equals(altsy))
                                        continue;
                                    if (!ctxx.isVariant() && ctxy.isVariant()) {
                                        row.remove(x);
                                    } else if (ctxx.isVariant() && !ctxy.isVariant()) {
                                        row.remove(y);
                                    } else if (!ctxx.isSNP() && ctxy.isSNP()) {
                                        row.remove(x);
                                    } else if (ctxx.isSNP() && !ctxy.isSNP()) {
                                        row.remove(y);
                                    } else if (altsx.size() > altsy.size()) {
                                        row.remove(x);
                                    } else if (altsx.size() < altsy.size()) {
                                        row.remove(y);
                                    } else {
                                        row.remove(y);
                                    }
                                    ok = false;
                                    break;
                                }
                            }
                            if (ok)
                                break;
                        }
                        fileids_for_variant = row.stream().map(LAF -> LAF.fileIdx).collect(Collectors.toSet());
                    }
                    if (row.size() != fileids_for_variant.size()) {
                        LOG.error("There are some duplicated variants at the position " + new ContigPosRef(first) + " in the same vcf file");
                        for (final LineAndFile laf : row) {
                            LOG.error("File [" + laf.fileIdx + "]" + fileid2sampleName.get(laf.fileIdx));
                            LOG.error("\t" + laf.getContigPosRef());
                        }
                        row.clear();
                    } else {
                        final Set<Allele> alleles = row.stream().flatMap(R -> R.getContext().getAlleles().stream()).collect(Collectors.toSet());
                        final VariantContextBuilder vcb = new VariantContextBuilder(getClass().getName(), first.getContig(), first.getStart(), first.getEnd(), alleles);
                        final Set<String> filters = new HashSet<>();
                        final Set<VariantContext.Type> variantContextTypes = new HashSet<>();
                        final List<Genotype> genotypes = new ArrayList<Genotype>();
                        for (final LineAndFile laf : row) {
                            if (laf.getContext().isFiltered())
                                filters.add(variantWasFiltered.getID());
                            variantContextTypes.add(laf.getContext().getType());
                            final GenotypeBuilder gbuilder = new GenotypeBuilder();
                            gbuilder.name(fileid2sampleName.get(laf.fileIdx));
                            if (unqueSampleName.isPresent()) {
                                final Genotype g0 = laf.getContext().getGenotype(unqueSampleName.get());
                                if (g0 == null) {
                                    iter.close();
                                    w.close();
                                    throw new IllegalStateException("Cannot find genotype for " + unqueSampleName.get());
                                }
                                if (g0.hasDP())
                                    gbuilder.DP(g0.getDP());
                                if (g0.hasGQ())
                                    gbuilder.GQ(g0.getGQ());
                                gbuilder.alleles(g0.getAlleles());
                            } else {
                                gbuilder.alleles(Arrays.asList(first.getReference(), first.getReference()));
                                if (laf.getContext().hasAttribute(VCFConstants.DEPTH_KEY)) {
                                    gbuilder.DP(laf.getContext().getAttributeAsInt(VCFConstants.DEPTH_KEY, 0));
                                }
                            }
                            if (laf.getContext().isFiltered()) {
                                gbuilder.filter("VCFFILTERED");
                            }
                            if (laf.getContext().hasLog10PError()) {
                                gbuilder.attribute(variantQUALFormat.getID(), laf.getContext().getPhredScaledQual());
                            }
                            gbuilder.attribute(VCFConstants.ALLELE_NUMBER_KEY, laf.getContext().getGenotypes().stream().flatMap(G -> G.getAlleles().stream()).filter(A -> !A.isNoCall()).count());
                            gbuilder.attribute(VCFConstants.ALLELE_COUNT_KEY, laf.getContext().getGenotypes().stream().flatMap(G -> G.getAlleles().stream()).filter(A -> !(A.isReference() || A.isNoCall())).count());
                            gbuilder.attribute(variantTypeFormat.getID(), laf.getContext().getType().name());
                            genotypes.add(gbuilder.make());
                        }
                        final String id = String.join(";", row.stream().map(LAF -> LAF.getContext()).filter(V -> V.hasID()).map(V -> V.getID()).collect(Collectors.toSet()));
                        if (!id.isEmpty())
                            vcb.id(id);
                        vcb.genotypes(genotypes);
                        if (unqueSampleName.isPresent()) {
                            boolean all_same = true;
                            for (int x = 0; all_same && x + 1 < genotypes.size(); ++x) {
                                if (!genotypes.get(x).isCalled())
                                    continue;
                                for (int y = x + 1; all_same && y < genotypes.size(); ++y) {
                                    if (!genotypes.get(y).isCalled())
                                        continue;
                                    if (!genotypes.get(x).sameGenotype(genotypes.get(y), true)) {
                                        all_same = false;
                                        break;
                                    }
                                }
                            }
                            if (!all_same)
                                filters.add(uniqueVariantDiscordantGTFilter.getID());
                        }
                        // Add AN
                        vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, genotypes.stream().filter(G -> G.isCalled()).mapToInt(G -> G.getAlleles().size()).sum());
                        if (!variantContextTypes.isEmpty()) {
                            vcb.attribute(variantTypesInfo.getID(), new ArrayList<>(variantContextTypes.stream().map(T -> T.name()).collect(Collectors.toSet())));
                            if (variantContextTypes.size() > 1) {
                                filters.add(multipleTypeFilters.getID());
                            }
                        }
                        vcb.attribute(foundInCountVcfInfo.getID(), fileids_for_variant.size());
                        boolean print = true;
                        if (row.size() == super.inputs.size() && ignore_everywhere) {
                            print = false;
                        }
                        if (fileids_for_variant.size() != fileid2sampleName.size()) {
                            filters.add(variantNotCalledInAllVcf.getID());
                            if (only_everywhere) {
                                print = false;
                            }
                        }
                        vcb.filters(filters);
                        if (print) {
                            w.add(vcb.make());
                        }
                    }
                    row.clear();
                }
                if (rec == null)
                    break;
            }
            row.add(rec);
        }
        iter.close();
        iter = null;
        w.close();
        w = null;
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(w);
        try {
            if (variants != null)
                variants.cleanup();
        } catch (Exception err) {
        }
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ContigPosRef(com.github.lindenb.jvarkit.util.vcf.ContigPosRef) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFConstants(htsjdk.variant.vcf.VCFConstants) Counter(com.github.lindenb.jvarkit.util.Counter) SortingCollection(htsjdk.samtools.util.SortingCollection) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) TreeMap(java.util.TreeMap) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Optional(java.util.Optional) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Pattern(java.util.regex.Pattern) Comparator(java.util.Comparator) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Counter(com.github.lindenb.jvarkit.util.Counter) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) ContigPosRef(com.github.lindenb.jvarkit.util.vcf.ContigPosRef) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) TreeMap(java.util.TreeMap) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Example 19 with VCFInfoHeaderLine

use of htsjdk.variant.vcf.VCFInfoHeaderLine in project jvarkit by lindenb.

the class VcfLiftOver method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
    VariantContextWriter failed = null;
    GenomicSequence genomicSequence = null;
    try {
        final VCFHeader inputHeader = in.getHeader();
        final Set<VCFHeaderLine> headerLines = inputHeader.getMetaDataInInputOrder().stream().filter(V -> {
            if (!(V instanceof VCFInfoHeaderLine))
                return true;
            final VCFInfoHeaderLine vih = VCFInfoHeaderLine.class.cast(V);
            if (removeInfo.contains(vih.getID()))
                return false;
            return true;
        }).collect(Collectors.toSet());
        if (this.failedFile != null) {
            final VCFHeader header2 = new VCFHeader(headerLines, inputHeader.getSampleNamesInOrder());
            header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
            header2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
            header2.addMetaDataLine(new VCFInfoHeaderLine(this.failedinfoTag, 1, VCFHeaderLineType.String, "Why the liftOver failed."));
            failed = super.openVariantContextWriter(failedFile);
            failed.writeHeader(header2);
        }
        final VCFHeader header3 = new VCFHeader(headerLines, inputHeader.getSampleNamesInOrder());
        header3.setSequenceDictionary(this.indexedFastaSequenceFile.getSequenceDictionary());
        header3.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
        header3.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
        header3.addMetaDataLine(new VCFInfoHeaderLine(this.infoTag, 1, VCFHeaderLineType.String, "Chromosome|Position before liftOver."));
        out.writeHeader(header3);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(in.getHeader());
        while (in.hasNext()) {
            VariantContext ctx = progress.watch(in.next());
            if (!this.removeInfo.isEmpty()) {
                VariantContextBuilder vcb = new VariantContextBuilder(ctx);
                for (final String tag : this.removeInfo) vcb.rmAttribute(tag);
                ctx = vcb.make();
            }
            if (ctx.isIndel() && this.ignoreIndels) {
                if (failed != null)
                    failed.add(new VariantContextBuilder(ctx).attribute(this.failedinfoTag, "Indel").make());
                continue;
            }
            if (adaptivematch) {
                double minAlleleLength = Math.min(0, ctx.getAlleles().stream().mapToInt(A -> A.length()).min().orElse(0));
                double maxAlleleLength = Math.max(1, ctx.getAlleles().stream().mapToInt(A -> A.length()).max().orElse(1));
                this.liftOver.setLiftOverMinMatch(minAlleleLength / maxAlleleLength);
            }
            final Interval lifted = liftOver.liftOver(new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), // negative strand
            false, String.join("|", ctx.getContig(), String.valueOf(ctx.getStart()), ctx.getReference().toString())));
            if (lifted == null) {
                if (failed != null)
                    failed.add(new VariantContextBuilder(ctx).attribute(this.failedinfoTag, "LiftOverFailed").make());
            } else if (this.indexedFastaSequenceFile.getSequenceDictionary().getSequence(lifted.getContig()) == null) {
                if (failed != null)
                    failed.add(new VariantContextBuilder(ctx).attribute(this.failedinfoTag, "ContigMissingDictionary|" + lifted.getContig()).make());
            } else {
                boolean alleleAreValidatedVsRef = true;
                // part of the code was copied from picard/liftovervcf
                final Map<Allele, Allele> reverseComplementAlleleMap = new HashMap<>();
                final List<Allele> alleles = new ArrayList<Allele>();
                for (final Allele oldAllele : ctx.getAlleles()) {
                    final Allele fixedAllele;
                    if (oldAllele.isSymbolic() || oldAllele.isNoCall() || oldAllele.equals(Allele.SPAN_DEL)) {
                        alleles.add(oldAllele);
                        continue;
                    } else if (lifted.isPositiveStrand()) {
                        fixedAllele = oldAllele;
                        alleles.add(oldAllele);
                    } else {
                        fixedAllele = Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference());
                        alleles.add(fixedAllele);
                        reverseComplementAlleleMap.put(oldAllele, fixedAllele);
                    }
                    if (this.checkAlleleSequence) {
                        if (genomicSequence == null || !genomicSequence.getChrom().equals(lifted.getContig())) {
                            genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, lifted.getContig());
                        }
                        final String alleleStr = fixedAllele.getBaseString();
                        int x = 0;
                        while (x < alleleStr.length() && lifted.getStart() - 1 + x < genomicSequence.length()) {
                            final char refChar = genomicSequence.charAt(lifted.getStart() - 1 + x);
                            if (Character.toLowerCase(refChar) != Character.toLowerCase(alleleStr.charAt(x))) {
                                alleleAreValidatedVsRef = false;
                                break;
                            }
                            ++x;
                        }
                        if (x != alleleStr.length()) {
                            alleleAreValidatedVsRef = false;
                            break;
                        }
                    }
                }
                if (!alleleAreValidatedVsRef) {
                    if (failed != null)
                        failed.add(new VariantContextBuilder(ctx).attribute(this.failedinfoTag, "AlleleMismatchRef").make());
                    continue;
                }
                if (lifted.getEnd() - lifted.getStart() != ctx.getEnd() - ctx.getStart()) {
                    if (failed != null)
                        failed.add(new VariantContextBuilder(ctx).attribute(this.failedinfoTag, "AlleleBadLength|" + lifted.length()).make());
                    continue;
                }
                final VariantContextBuilder vcb = new VariantContextBuilder(ctx.getSource(), lifted.getContig(), lifted.getStart(), lifted.getEnd(), alleles);
                vcb.id(ctx.getID());
                vcb.attributes(ctx.getAttributes());
                vcb.attribute(this.infoTag, ctx.getContig() + "|" + ctx.getStart() + "|" + ctx.getReference().getDisplayString());
                vcb.filters(ctx.getFilters());
                vcb.log10PError(ctx.getLog10PError());
                final GenotypesContext genotypeContext = ctx.getGenotypes();
                final GenotypesContext fixedGenotypes = GenotypesContext.create(genotypeContext.size());
                for (final Genotype genotype : genotypeContext) {
                    final List<Allele> fixedAlleles = new ArrayList<Allele>();
                    for (final Allele allele : genotype.getAlleles()) {
                        final Allele fixedAllele = reverseComplementAlleleMap.containsKey(allele) ? reverseComplementAlleleMap.get(allele) : allele;
                        fixedAlleles.add(fixedAllele);
                    }
                    fixedGenotypes.add(new GenotypeBuilder(genotype).alleles(fixedAlleles).make());
                }
                vcb.genotypes(fixedGenotypes);
                out.add(vcb.make());
            }
        }
        if (failed != null) {
            failed.close();
            failed = null;
        }
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(failed);
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) HashMap(java.util.HashMap) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LiftOver(htsjdk.samtools.liftover.LiftOver) Interval(htsjdk.samtools.util.Interval) Map(java.util.Map) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) Set(java.util.Set) Collectors(java.util.stream.Collectors) File(java.io.File) GenotypesContext(htsjdk.variant.variantcontext.GenotypesContext) List(java.util.List) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) GenotypesContext(htsjdk.variant.variantcontext.GenotypesContext) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) VariantContext(htsjdk.variant.variantcontext.VariantContext) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) ArrayList(java.util.ArrayList) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashMap(java.util.HashMap) Map(java.util.Map) Interval(htsjdk.samtools.util.Interval)

Example 20 with VCFInfoHeaderLine

use of htsjdk.variant.vcf.VCFInfoHeaderLine in project jvarkit by lindenb.

the class VcfJaspar method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
    final String ATT = "JASPAR";
    GenomicSequence genomicSequence = null;
    final VCFHeader header = new VCFHeader(in.getHeader());
    addMetaData(header);
    header.addMetaDataLine(new VCFInfoHeaderLine(ATT, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Jaspar pattern overlapping: Format: (Name|length|Score/1000|pos|strand)"));
    out.writeHeader(header);
    while (in.hasNext()) {
        VariantContext var = in.next();
        if (genomicSequence == null || !genomicSequence.getChrom().equals(var.getContig())) {
            LOG.info("Loading sequence " + var.getContig());
            genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, var.getContig());
        }
        final Set<String> hits = new HashSet<String>();
        for (final Matrix matrix : this.jasparDb) {
            int start0 = Math.max(0, var.getStart() - matrix.length());
            for (int y = start0; y < var.getStart() && y + matrix.length() <= genomicSequence.length(); ++y) {
                CharSequence forward = new SubSequence(genomicSequence, y, y + matrix.length());
                CharSequence revcomp = new RevCompCharSequence(forward);
                // run each strand
                for (int strand = 0; strand < 2; ++strand) {
                    double score = matrix.score(strand == 0 ? forward : revcomp);
                    if (score <= 0)
                        continue;
                    if (score >= matrix.max() * this.fraction_of_max) {
                        StringBuilder b = new StringBuilder("(");
                        b.append(matrix.getName().replaceAll("[ \t;=]+", "/"));
                        b.append("|");
                        b.append(matrix.length());
                        b.append("|");
                        b.append((int) (1000.0 * (score / matrix.max())));
                        b.append("|");
                        b.append(y + 1);
                        b.append("|");
                        b.append(strand == 0 ? '+' : '-');
                        b.append(")");
                        hits.add(b.toString());
                        break;
                    }
                }
            }
        }
        if (hits.isEmpty()) {
            out.add(var);
            continue;
        }
        final VariantContextBuilder vcb = new VariantContextBuilder(var);
        vcb.attribute(ATT, hits.toArray(new String[hits.size()]));
        out.add(vcb.make());
    }
    return RETURN_OK;
}
Also used : GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) SubSequence(com.github.lindenb.jvarkit.lang.SubSequence) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet)

Aggregations

VCFInfoHeaderLine (htsjdk.variant.vcf.VCFInfoHeaderLine)55 VCFHeader (htsjdk.variant.vcf.VCFHeader)49 VariantContext (htsjdk.variant.variantcontext.VariantContext)37 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)37 ArrayList (java.util.ArrayList)34 HashSet (java.util.HashSet)32 VCFHeaderLine (htsjdk.variant.vcf.VCFHeaderLine)31 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)25 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)25 Allele (htsjdk.variant.variantcontext.Allele)22 IOException (java.io.IOException)20 File (java.io.File)19 Genotype (htsjdk.variant.variantcontext.Genotype)17 GenotypeBuilder (htsjdk.variant.variantcontext.GenotypeBuilder)17 Set (java.util.Set)17 HashMap (java.util.HashMap)16 List (java.util.List)16 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)14 VCFFilterHeaderLine (htsjdk.variant.vcf.VCFFilterHeaderLine)14 VCFFormatHeaderLine (htsjdk.variant.vcf.VCFFormatHeaderLine)14